Using as guide: https://satijalab.org/signac/articles/seuset_multiomic.html.
## Parameters for processing dataset and metadata ##
MT_genes_file = "../scrna/genomes/genome_addons/MT_genes.txt"
# Edit plate names with substitutes:
old_col_pattern = ""
new_col_pattern = ""
# Retrieve the variables from the platenames (fields seperated with "_"):
plate_variables = c("Method", "Lineage", "Timepoint", "Prep" , "Barcode")
# Unique combined ID per plate, for visualization purposes
combined_variables_to_id = c("Method", "Lineage", "Timepoint")
## Filtering of the dataset ##
# Settings for genes
gene_tresh = 1
amount_cells_expr = 2
# Settings for cells
total_counts_tresh = 1000
total_feat_tresh = 1000
ERCC_pct_max <- 20
mt_pct_max <- 60
## Subsetting the dataset ##
# Specify a specific (sub)string to select the cells on (selection happens on the colnames)
subset_id = ""
# Select the type of filtering: keep cells with the substring (set "in") or remove (set "out")
# if NO filtering is wanted, leave empty (set "" or "no")
filtering = "no"
## Seurat Normalization, HVG selection (vst) & Scaling (and Regression) ##
nHVG = 2000
pc_set = 30
res_set = 0.5
# Regression performed on the following variables:
vars_to_regress = c("nCount_sf", "nFeature_sf") # If no regression desired: NULL
## Dimensionality reduction ##
# For PCA to run on
pcs_max = 70
# PCs used for different UMAP representations
pcs_for_overview = c(5, 10, 20, 25, 30, 32, 35, 37, 40, 50)
## Visualization ##
# label from phenodata to color in Scater and Seurat plots
lab_col = "combined_id"
umap_col = "seurat_clusters"
umap_col2 = "Library"
# Checking variability explained by confounding factors
confounders_to_test = c("combined_id","Lineage","Timepoint")
# Marker genes for violin plots
explore_violin = c("NKX2-5", "MYL7")
## Loading 10X files ##
rna_whitelist <- "../cellranger-software/cellranger-arc-2.0.1/lib/python/cellranger/barcodes/737K-arc-v1.txt.gz"
atac_whitelist <- "../cellranger-software/cellranger-arc-2.0.1/lib/python/atac/barcodes/737K-arc-v1.txt.gz"
h5_10X_file = "/scratch/snabel/scrna/CMdiff_10XMO_EB_d8_GRCh38/10XMO_EB_d8_combined/outs/raw_feature_bc_matrix.h5"
frag_path <- "/scratch/snabel/scrna/CMdiff_10XMO_EB_d8_GRCh38/10XMO_EB_d8_combined/outs/atac_fragments.tsv.gz"
barcodes_file_AM <- "../scrna/CMdiff_10XMO_EB_d8/test_cellranger_ATAC/EB_AM_d8_combinedfc/outs/analysis/umap/2_components/projection.csv"
barcodes_file_VM <- "../scrna/CMdiff_10XMO_EB_d8/test_cellranger_ATAC/EB_VM_d8_combinedfc/outs/analysis/umap/2_components/projection.csv"
## Storing results ##
workdir <- "/scratch/snabel/scrna/CMdiff_10XMO_EB_d8_hg38/R_analysis/"
# if regression is performed: this will already be included in the folder name
result_descript = "_results_stringent_preprocess_PeakcallRNAClusters_keeptmp"
## Location of scripts ##
#source("../R-scripts/scRNA-seq/read_cellranger_dataset_10X.R")
system(paste("mkdir -p ", workdir))
knitr::opts_knit$set(root.dir = normalizePath(workdir))
counts <- Read10X_h5(h5_10X_file)
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## Genome matrix has multiple modalities, returning a list of matrices for this genome
dateoftoday <- gsub("-", "", as.character(Sys.Date()))
resultsdir <- paste0(workdir, dateoftoday, result_descript)
system(paste("mkdir -p ", resultsdir))
knitr::opts_knit$set(root.dir = normalizePath(resultsdir))
seuset <- CreateSeuratObject(
counts = counts$`Gene Expression`,
assay = "RNA"
)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
rna_bc <- as.data.frame(read.table(gzfile(rna_whitelist),sep="\t"))
atac_bc <- as.data.frame(read.table(gzfile(atac_whitelist),sep="\t"))
rna_bc <- data.frame(c(paste0(rna_bc$V1, "-1"), paste0(rna_bc$V1, "-2")))
atac_bc <- data.frame(c(paste0(atac_bc$V1, "-1"), paste0(atac_bc$V1, "-2")))
multiome_bc <- cbind(rna_bc,atac_bc)
rm(rna_bc)
rm(atac_bc)
colnames(multiome_bc) <- c("RNA","ATAC")
head(multiome_bc)
## shorten barcode table for barcodes present in the dataset
multiome_bc <- multiome_bc[multiome_bc$RNA %in% colnames(seuset),]
rownames(multiome_bc) <- multiome_bc$RNA
multiome_bc <- multiome_bc[colnames(seuset),]
identical(rownames(multiome_bc), colnames(seuset))
## [1] TRUE
VM_BC <- read.csv(barcodes_file_VM)$Barcode
AM_BC <- read.csv(barcodes_file_AM)$Barcode
AM_BC <- gsub("-1","-2",AM_BC)
atac_cells <- c(VM_BC, AM_BC)
multiome_bc$healthy_atac <- FALSE
multiome_bc$healthy_atac[multiome_bc$ATAC %in% atac_cells] <- TRUE
table(multiome_bc$healthy_atac)
##
## FALSE TRUE
## 1353843 26549
table(multiome_bc$ATAC %in% atac_cells)
##
## FALSE TRUE
## 1353843 26549
table(atac_cells %in% multiome_bc$ATAC)
##
## TRUE
## 26549
identical(rownames(multiome_bc), colnames(seuset))
## [1] TRUE
seuset@meta.data$ATAC_id <- multiome_bc$ATAC
seuset@meta.data$healthy_atac <- multiome_bc$healthy_atac
seuset_healthy <- seuset[, seuset$healthy_atac == "TRUE"]
seuset_non_healthy <- seuset[, seuset$healthy_atac == "FALSE"]
VlnPlot(seuset, c("nCount_RNA"), group.by = "healthy_atac")
seuset_nonfiltered <- seuset
seuset <- seuset_healthy
hist(seuset_nonfiltered$nCount_RNA, breaks = 100)
hist(seuset$nCount_RNA, breaks = 100)
seuset@meta.data$Library <- data.frame(str_split_fixed(colnames(seuset), pattern = "-", n = 2))[,2]
seuset@meta.data$Lineage <- NA
seuset@meta.data$Lineage[seuset@meta.data$Library == "1"] <- "VM"
seuset@meta.data$Lineage[seuset@meta.data$Library == "2"] <- "AM"
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
seqlevelsStyle(annotation) <- "UCSC"
seuset[["ATAC"]] <- CreateChromatinAssay(
counts = counts$Peaks[,colnames(seuset)],
sep = c(":", "-"),
fragments = frag_path,
annotation = annotation
)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Computing hash
seuset
## An object of class Seurat
## 427490 features across 26549 samples within 2 assays
## Active assay: RNA (62750 features, 0 variable features)
## 1 other assay present: ATAC
seuset@meta.data$Library <- data.frame(str_split_fixed(colnames(seuset), pattern = "-", n = 2))[,2]
seuset@meta.data$Lineage <- NA
seuset@meta.data$Lineage[seuset@meta.data$Library == "1"] <- "VM"
seuset@meta.data$Lineage[seuset@meta.data$Library == "2"] <- "AM"
DefaultAssay(seuset) <- "ATAC"
# Strength of nucleosome signal per cell
# Ratio of fragments 147-294bp and <147 np (nucleosome free)
seuset <- NucleosomeSignal(seuset)#, n = 100000000)
# TSS enrichent is a signal to noise calculation
# Reads around reference set of TSS are collected in the region 2000bp up and down stream of a TSS
seuset <- TSSEnrichment(seuset, fast = FALSE)#, n = 1000000)
## Extracting TSS positions
## Finding + strand cut sites
## Finding - strand cut sites
## Computing mean insertion frequency in flanking regions
## Normalizing TSS score
seuset$nucleosome_group <- ifelse(seuset$nucleosome_signal > 4, 'NS > 4', 'NS < 4')
FragmentHistogram(object = seuset, group.by = 'nucleosome_group', region = 'chr1-1-10000000')
## Warning: Removed 218 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
seuset$high.tss <- ifelse(seuset$TSS.enrichment > 2, 'High', 'Low')
TSSPlot(seuset, group.by = 'high.tss') + NoLegend()
VlnPlot(
object = seuset,
features = c("nCount_RNA", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal"),
ncol = 4,
pt.size = 0,
split.by = "Library"
)
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.
VlnPlot(
object = seuset,
features = c("nCount_RNA", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal"),
ncol = 4
)
total_RNA <- (seuset@meta.data$nCount_RNA[order(seuset@meta.data$nCount_RNA, decreasing = TRUE)])
barplot(total_RNA)
total_fragments <- seuset@meta.data$nCount_ATAC[order(seuset@meta.data$nCount_ATAC, decreasing = TRUE)]
barplot(total_fragments)
DefaultAssay(seuset) <- "RNA"
seuset[["percent.mt"]] <- PercentageFeatureSet(seuset, pattern = "^MT-")
rb_genes <- rownames(seuset[grep("RP[SL]",rownames(seuset)),])
seuset[["percent.rb"]] <- PercentageFeatureSet(seuset, pattern = "RP[SL]")
VlnPlot(seuset, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.rb"), ncol = 3, assay = "RNA")
plot1 <- FeatureScatter(seuset, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "Library")
plot2 <- FeatureScatter(seuset, feature1 = "nCount_RNA", feature2 = "percent.rb", group.by = "Library")
plot3 <- FeatureScatter(seuset, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Library")
CombinePlots(plots = list(plot1, plot2, plot3))
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.
saveRDS(seuset, "seuset_10XMO_d8_unfiltered.rds")
#seuset <- readRDS("seuset_10XMO_d8_unfiltered.rds")
seuset_raw <- seuset
# filter out low quality cells
seuset <- subset(
x = seuset,
subset = nCount_ATAC < 20000 &
nCount_RNA < 10000 &
nCount_ATAC > 5000 &
nCount_RNA > 1000 &
nucleosome_signal < 1 &
## TSS enrichment of 4 is a stringent threshold used in ArchR
TSS.enrichment > 4 &
percent.mt < 20
)
seuset
## An object of class Seurat
## 427490 features across 17027 samples within 2 assays
## Active assay: RNA (62750 features, 0 variable features)
## 1 other assay present: ATAC
#hist(seuset_nonfiltered$nCount_RNA, breaks = 100)
hist(seuset$nCount_RNA, breaks = 100)
hist(seuset$nFeature_RNA, breaks = 100)
#hist(seuset_nonfiltered$nCount_ATAC, breaks = 100)
hist(seuset$nCount_ATAC, breaks = 100)
hist(seuset$nFeature_ATAC, breaks = 100)
total_RNA <- (seuset@meta.data$nCount_RNA[order(seuset@meta.data$nCount_RNA, decreasing = TRUE)])
barplot(total_RNA)
total_fragments <- seuset@meta.data$nCount_ATAC[order(seuset@meta.data$nCount_ATAC, decreasing = TRUE)]
barplot(total_fragments)
VlnPlot(
object = seuset,
features = c("nCount_RNA", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal"),
ncol = 4,
pt.size = 0,
split.by = "Library"
)
DefaultAssay(seuset) <- "RNA"
VlnPlot(seuset, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.rb"), ncol = 3, assay = "RNA")
plot1 <- FeatureScatter(seuset, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "Library")
plot2 <- FeatureScatter(seuset, feature1 = "nCount_RNA", feature2 = "percent.rb", group.by = "Library")
plot3 <- FeatureScatter(seuset, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Library")
CombinePlots(plots = list(plot1, plot2, plot3))
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.
DefaultAssay(seuset) <- "RNA"
seuset <- NormalizeData(seuset)
seuset <- FindVariableFeatures(seuset, selection.method = "vst", nfeatures = nHVG)
VariableFeaturePlot(seuset)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 22973 rows containing missing values (geom_point).
seuset <- ScaleData(seuset, features = rownames(seuset))
## Centering and scaling data matrix
seuset <- RunPCA(seuset, features = VariableFeatures(seuset), reduction.name = "RNA_pca")
## PC_ 1
## Positive: KCNQ5, SLC8A1, MYH6, NEBL, EMC10, ENSG00000268518, PALM2AKAP2, MYL7, LINC01099, MYL4
## TTN, MYOCD, FILIP1, SORBS2, PALLD, ACTC1, CACNA1C, ENSG00000254202, HPSE2, TAFA4
## TRDN, PDLIM5, INTS6L, ENSG00000289950, FHOD3, MYO18B, ENSG00000289973, PPP1R12B, CACNB2, RYR2
## Negative: DACH2, KCNJ3, NR2F2-AS1, CASC15, DACH1, FLRT2, COL11A1, LRP1B, TECRL, PDZRN4
## NR2F1-AS1, LSAMP, NR2F2, DCC, KCTD8, MDK, NR2F1, ROBO2, SYNE2, LINC01933
## SHROOM3, ENSG00000239268, PCDH9, CTNNA2, DPP6, FAT3, KHDRBS2, CLSTN2, VEGFC, KCNT2
## PC_ 2
## Positive: FLI1, XACT, NRG3, RASGEF1B, RAPGEF5, CALCRL, SNTB1, LIMCH1, VAV3, PTPRB
## RASGRP3, TEK, TFPI, RNF128, LINC02955, CMTM8, HNF1A-AS1, LDB2, ERG, APOB
## CUBN, CD34, ADGRL4, ST8SIA4, FAR2, THRB, EMCN, ENSG00000227400, MTTP, KDR
## Negative: RYR2, SLIT2, KCNH7, HCN1, TECRL, CTNNA3, UNC5C, ADAMTS6, ANGPT1, LRP1B
## MAGI2, CACNA1D, SEMA5A, MLLT3, CLSTN2, KCNIP4, CCDC141, PDLIM5, KCNJ3, TTN
## GUCY1A1, DACH2, COL11A1, NRXN1, NR2F2-AS1, TBX20, CMYA5, TSHZ2, VEGFC, FHOD3
## PC_ 3
## Positive: SNTB1, RNF128, HNF1A-AS1, APOB, ENSG00000227400, CNKSR2, MTTP, LINC02955, APOA1, A1CF
## CUBN, THRB, LINC00348, PIK3C2G, COBL, TTR, LYPD6B, ESRP1, AKR1D1, EPHA6
## PRDM1, APOA2, TRPC5, GHR, GLYATL2, RFX6, CAMK2D, SCN7A, ENSG00000284418, KANK4
## Negative: FLI1, CALCRL, NRG3, PTPRB, LDB2, VAV3, RASGRP3, TEK, ERG, CD34
## EMCN, ADGRL4, EGFL7, ETS1, PLVAP, KDR, LIMCH1, ENSG00000286251, CFAP161, TFPI
## RHOJ, PTCHD4, PLEKHG1, HAPLN1, PECAM1, SAMSN1, DYSF, CDH5, FLT1, TCIM
## PC_ 4
## Positive: MT-CO3, MT-RNR2, MT-CO2, MT-ATP6, MT-CYB, MT-RNR1, MT-CO1, MT-ND4, MT-ND1, MT-ND2
## MT-ND3, MT-ND5, MTATP6P1, MT-ATP8, HSP90AB1, MT-ND4L, APOE, COL1A1, APOA1, MDK
## MT-ND6, APOA2, APOB, COL1A2, COL6A1, TNNI1, RBP4, SCD, KRT18, TTR
## Negative: DIAPH3, LRRTM4, BRIP1, NALF1, CTNNA3, POLQ, MECOM, NRXN3, DLEU2, CEP128
## LINC02511, MID1, ENSG00000225689, CENPP, DLG2, ASPM, SCLT1, PTPRM, MMS22L, ARHGAP6
## KIF18B, DEPDC1B, RYR2, KCNH7, HCN1, ERBB4, UNC5C, ECT2, ADAMTS6, ADAMTS19
## PC_ 5
## Positive: ANGPT1, KCNH7, HCN1, TECRL, MAGI2, ADAMTS6, CTNNA3, MLLT3, RABGAP1L, AKR1D1
## CUBN, RYR2, SLIT2, CACNA1D, SEMA5A, KHDRBS2, PDLIM5, A1CF, PLCL1, APOA1
## RBP4, MTTP, ERBB4, CLSTN2, APOB, KANK4, RASGEF1B, FLRT2, RNF128, NR2F2-AS1
## Negative: NRXN3, DIAPH3, ASPM, DGKB, POLQ, BRIP1, KIF18B, RIMS2, KIF4A, EBF2
## TOP2A, NCAPG, CENPE, CIT, CDC25C, BUB1, DEPDC1B, KIF18A, KIF14, CDCA2
## ECT2, RTKN2, CA10, NDC80, RRM2, LINC00698, KIF15, ADAMTS18, BUB1B, ANLN
seuset <- FindNeighbors(seuset, dims = 1:pc_set, reduction = "RNA_pca")
## Computing nearest neighbor graph
## Computing SNN
## using a coarse clustering
seuset <- FindClusters(seuset, dims = 1:pc_set, resolution = res_set)
## Warning: The following arguments are not used: dims
## Warning: The following arguments are not used: dims
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 17027
## Number of edges: 574548
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8766
## Number of communities: 10
## Elapsed time: 2 seconds
seuset <- RunUMAP(seuset, dims = 1:pc_set, reduction = "RNA_pca", reduction.name = "RNA_umap")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 07:12:06 UMAP embedding parameters a = 0.9922 b = 1.112
## 07:12:06 Read 17027 rows and found 30 numeric columns
## 07:12:06 Using Annoy for neighbor search, n_neighbors = 30
## 07:12:06 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 07:12:08 Writing NN index file to temp file /scratch/snabel/tmp/3sIR/RtmpjimgQc/file37d2c71cb90f48
## 07:12:08 Searching Annoy index using 1 thread, search_k = 3000
## 07:12:13 Annoy recall = 100%
## 07:12:20 Commencing smooth kNN distance calibration using 1 thread
## 07:12:26 Initializing from normalized Laplacian + noise
## 07:12:27 Commencing optimization for 200 epochs, with 738754 positive edges
## 07:12:37 Optimization finished
p1_rna <- DimPlot(seuset, group.by = "seurat_clusters", reduction = "RNA_umap")
DimPlot(seuset, group.by = "seurat_clusters", reduction = "RNA_umap")
DimPlot(seuset, reduction = "RNA_umap", group.by = "seurat_clusters", label = TRUE)
DimPlot(seuset, reduction = "RNA_umap", group.by = "Library")
FeaturePlot(seuset, reduction = "RNA_umap", "nCount_RNA")
# split dataset into a list of seurat objects
seuset.list <- SplitObject(seuset, split.by = "Library")
# normalize and identify variable features for each dataset independently
seuset.list <- lapply(X = seuset.list, FUN = function(x) {
DefaultAssay(x) <- "RNA"
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
Reductions(seuset)
## [1] "RNA_pca" "RNA_umap"
# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = seuset.list)
seuset.anchors <- FindIntegrationAnchors(object.list = seuset.list, anchor.features = features)
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 24329 anchors
## Filtering anchors
## Retained 2868 anchors
seuset.int.RNA <- IntegrateData(anchorset = seuset.anchors, new.assay.name = "integrated_RNA")
## Merging dataset 2 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_rna_ to integratedrna_
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_rna_ to integratedrna_
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_rna_ to integratedrna_
seuset[["integrated_RNA"]] <- CreateAssayObject(GetAssayData(seuset.int.RNA[["integrated_RNA"]], slot = "data"))
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_rna_ to integratedrna_
DefaultAssay(seuset) <- "integrated_RNA"
seuset <- ScaleData(seuset)
## Centering and scaling data matrix
seuset <- RunPCA(seuset, assay = "integrated_RNA", reduction.name = "integrated_RNA_pca", features = rownames(seuset))
## PC_ 1
## Positive: RFX6, HNF1A-AS1, ST8SIA4, XACT, FLI1, CALCRL, LIMCH1, LDB2, NRG3, VAV3
## RASGEF1B, RASGRP3, DENND2C, PTPRB, TFPI, FGF13, EMCN, TEK, SNTB1, TTC6
## TTR, RNF128, RAPGEF5, ONECUT2, GASK1B, LAMB1, CMTM8, FAR2, EGFL7, ERG
## Negative: RYR2, DACH2, TTN, TECRL, LRP1B, HCN1, UNC5C, ANGPT1, ADAMTS6, CTNNA3
## KCNJ3, KCNH7, COL11A1, CLSTN2, SLIT2, CACNA1D, VEGFC, SEMA5A, TGFB2, PDLIM5
## MAGI2, VWDE, LSAMP, MLLT3, CDH4, PDZRN4, TBX20, NRXN1, SLC8A1, KCNIP4
## PC_ 2
## Positive: RFX6, HNF1A-AS1, FGF13, TTC6, CNKSR2, RNF128, BMPR1B, DENND2C, TRPC4, ONECUT2
## COL19A1, SNTB1, TTR, EXPH5, CAMK2D, MTTP, CDH6, SCN7A, PRDM1, HNF1B
## NEDD4L, MIR31HG, LINC00261, FHIP1A, C8orf34, GLDC, GLYATL2, LYPD6B, EML5, ZBTB16
## Negative: FLI1, CALCRL, LDB2, EMCN, RASGRP3, PTPRB, TEK, VAV3, NRG3, KDR
## CD34, ETS1, LIMCH1, EGFL7, PTCHD4, HAPLN1, NPR3, XACT, ERG, CFAP161
## ADGRL4, TFPI, PLVAP, CALCRL-AS1, SHE, PECAM1, F2RL2, PLEKHG1, TFPI2, PPP1R16B
## PC_ 3
## Positive: LSAMP, DIAPH3, GPC5, PRKG1, DACH1, ENSG00000239268, LRP1B, FAT3, FBXL7, FLRT2
## HCN1, BRIP1, NALF1, MECOM, ENSG00000225689, ADGRL3, LINC01440, LRRTM4, PDZRN4, POLQ
## MMS22L, PBX3, DLEU2, CASC15, KCNH7, PTPRM, VAV3, NHS, EFNA5, LINGO2
## Negative: MT-CO2, MT-CO3, MT-ATP6, MT-RNR2, MT-RNR1, MT-CO1, MT-CYB, MT-ND1, MT-ND4, MT-ND3
## MT-ND2, MT-ND5, MTATP6P1, HSP90AB1, COL1A1, MT-ATP8, FTL, FN1, APOE, MT-ND4L
## FTH1, TTR, APOA1, MARCKS, COL1A2, FGB, APOB, AFP, APOA2, SFRP1
## PC_ 4
## Positive: CTNNA3, TTN, RYR2, CACNA1D, PDLIM5, ANGPT1, CLSTN2, KCNIP4, KHDRBS2, SLC8A1
## ADAMTS6, SLIT2, PLCL2, HCN1, PPP1R12B, MAGI2, SORBS2, FILIP1, NEBL, KCNH7
## MEF2C, TECRL, CUBN, ERBB4, EMC10, SEMA5A, MLLT3, ST8SIA4, AFP, RABGAP1L
## Negative: ASPM, CENPE, CDCA2, DIAPH3, TOP2A, CENPF, KIF18A, BRIP1, KIF14, NCAPG
## ECT2, BUB1B, TPX2, KIF18B, KIF4A, KIF15, KIF23, GTSE1, BUB1, CDC25C
## NUF2, NDC80, GAS2L3, ANLN, POLQ, KIF11, CIP2A, DEPDC1B, RTKN2, KIF20B
## PC_ 5
## Positive: MT-CO1, MT-CO2, MT-CYB, MT-CO3, MT-RNR1, MT-ND1, MT-ND4, MT-ATP6, MT-RNR2, MT-ND3
## MT-ND2, MT-ND5, MTATP6P1, MT-ATP8, SFRP1, FN1, FLRT2, ROBO2, UNC5C, MT-ND4L
## HSP90AB1, LSAMP, PBX3, COL3A1, ASPM, LRP1B, TECRL, CENPF, CENPE, PDE3A
## Negative: LINC00698, LINC01090, SNTG1, CGA, ENSG00000223786, ENSG00000262198, UPP1, MIR4422HG, ENSG00000262801, NGFR
## SGCG, NUDT16L2P, ITGA2, ANXA3, RIMS2, FKTN-AS1, ENSG00000214788, GRHL2, PLCXD3, CDH1
## ACOXL, FAM83B, TAC1, LINC01889, ENSG00000268518, HS3ST4, ARHGEF3, XACT, MACC1, GNGT1
## Warning: Cannot add objects with duplicate keys (offending key: PC_), setting
## key to 'integrated_rna_pca_'
seuset <- RunUMAP(seuset, assay = "integrated_RNA", reduction = "integrated_RNA_pca",
dims = 1:pc_set,
reduction.name = "integrated_RNA_umap")
## 07:17:06 UMAP embedding parameters a = 0.9922 b = 1.112
## 07:17:06 Read 17027 rows and found 30 numeric columns
## 07:17:06 Using Annoy for neighbor search, n_neighbors = 30
## 07:17:06 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 07:17:07 Writing NN index file to temp file /scratch/snabel/tmp/3sIR/RtmpjimgQc/file37d2c71d72cdf3
## 07:17:07 Searching Annoy index using 1 thread, search_k = 3000
## 07:17:12 Annoy recall = 100%
## 07:17:16 Commencing smooth kNN distance calibration using 1 thread
## 07:17:22 Initializing from normalized Laplacian + noise
## 07:17:22 Commencing optimization for 200 epochs, with 739968 positive edges
## 07:17:32 Optimization finished
## Warning: Cannot add objects with duplicate keys (offending key: UMAP_), setting
## key to 'integrated_rna_umap_'
seuset <- FindNeighbors(seuset, reduction = "integrated_RNA_pca", dims = 1:pc_set)
## Computing nearest neighbor graph
## Computing SNN
seuset <- FindClusters(seuset, resolution = res_set)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 17027
## Number of edges: 555237
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8476
## Number of communities: 9
## Elapsed time: 2 seconds
p2_rna <- DimPlot(seuset, group.by = "seurat_clusters", label = TRUE)
DimPlot(seuset, reduction = "integrated_RNA_umap", group.by = "Library")
DimPlot(seuset, reduction = "integrated_RNA_umap", group.by = "seurat_clusters", label = TRUE)
FeaturePlot(seuset, reduction = "integrated_RNA_umap", "nCount_RNA")
(p1_rna + ggtitle("RNA")) | (p2_rna + ggtitle("RNA Integrated"))
DefaultAssay(seuset) <- "RNA"
FeaturePlot(seuset, reduction = "RNA_umap", c("COL3A1", "TNNT2", "NR2F2", "MYH7", "NMU", "PBX3"))
FeaturePlot(seuset, reduction = "integrated_RNA_umap", c("COL3A1", "TNNT2", "NR2F2", "MYH7", "NMU", "PBX3"))
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /vol/mbconda/snabel/anaconda3/envs/kb_scrna_R_signac_wf/lib/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ComplexHeatmap_2.10.0 BiocParallel_1.28.3
## [3] TFBSTools_1.32.0 JASPAR2020_0.99.10
## [5] BSgenome.Hsapiens.UCSC.hg38_1.4.4 BSgenome_1.62.0
## [7] rtracklayer_1.54.0 Biostrings_2.62.0
## [9] XVector_0.34.0 stringr_1.4.0
## [11] EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.18.1
## [13] AnnotationFilter_1.18.0 GenomicFeatures_1.46.1
## [15] GenomicRanges_1.46.1 GenomeInfoDb_1.30.0
## [17] clusterProfiler_4.2.0 org.Hs.eg.db_3.14.0
## [19] AnnotationDbi_1.56.1 IRanges_2.28.0
## [21] S4Vectors_0.32.3 Biobase_2.54.0
## [23] BiocGenerics_0.40.0 Signac_1.7.0
## [25] clustree_0.5.0 ggraph_2.0.5
## [27] plot3D_1.4 RColorBrewer_1.1-3
## [29] sp_1.5-0 SeuratObject_4.1.0
## [31] Seurat_4.0.3 knitr_1.39
## [33] tidyr_1.2.0 dplyr_1.0.9
## [35] ggplot2_3.3.6 devtools_2.4.3
## [37] usethis_2.1.6
##
## loaded via a namespace (and not attached):
## [1] Hmisc_4.7-0 ica_1.0-3
## [3] RcppRoll_0.3.0 ps_1.7.1
## [5] Rsamtools_2.10.0 foreach_1.5.2
## [7] lmtest_0.9-40 crayon_1.5.1
## [9] spatstat.core_2.4-4 MASS_7.3-57
## [11] nlme_3.1-158 backports_1.4.1
## [13] qlcMatrix_0.9.7 GOSemSim_2.20.0
## [15] rlang_1.0.3 ROCR_1.0-11
## [17] irlba_2.3.5 callr_3.7.0
## [19] limma_3.50.1 filelock_1.0.2
## [21] rjson_0.2.21 CNEr_1.30.0
## [23] bit64_4.0.5 glue_1.6.2
## [25] poweRlaw_0.70.6 sctransform_0.3.3
## [27] chromVAR_1.16.0 parallel_4.1.0
## [29] processx_3.7.0 spatstat.sparse_2.1-1
## [31] tcltk_4.1.0 DOSE_3.20.0
## [33] spatstat.geom_2.4-0 tidyselect_1.1.2
## [35] SummarizedExperiment_1.24.0 fitdistrplus_1.1-8
## [37] XML_3.99-0.9 zoo_1.8-10
## [39] GenomicAlignments_1.30.0 xtable_1.8-4
## [41] magrittr_2.0.3 evaluate_0.15
## [43] cli_3.3.0 zlibbioc_1.40.0
## [45] rstudioapi_0.13 miniUI_0.1.1.1
## [47] bslib_0.3.1 rpart_4.1.16
## [49] fastmatch_1.1-3 treeio_1.18.0
## [51] shiny_1.7.1 xfun_0.31
## [53] clue_0.3-60 pkgbuild_1.3.1
## [55] cluster_2.1.3 caTools_1.18.2
## [57] tidygraph_1.2.1 KEGGREST_1.34.0
## [59] tibble_3.1.7 ggrepel_0.9.1
## [61] biovizBase_1.42.0 ape_5.6-2
## [63] listenv_0.8.0 TFMPvalue_0.0.8
## [65] png_0.1-7 future_1.26.1
## [67] withr_2.5.0 bitops_1.0-7
## [69] slam_0.1-50 ggforce_0.3.3
## [71] plyr_1.8.7 sparsesvd_0.2
## [73] pracma_2.3.8 pillar_1.7.0
## [75] GlobalOptions_0.1.2 cachem_1.0.6
## [77] fs_1.5.2 hdf5r_1.3.5
## [79] GetoptLong_1.0.5 vctrs_0.4.1
## [81] ellipsis_0.3.2 generics_0.1.3
## [83] tools_4.1.0 foreign_0.8-82
## [85] munsell_0.5.0 tweenr_1.0.2
## [87] fgsea_1.20.0 DelayedArray_0.20.0
## [89] fastmap_1.1.0 compiler_4.1.0
## [91] pkgload_1.3.0 abind_1.4-5
## [93] httpuv_1.6.5 sessioninfo_1.2.2
## [95] plotly_4.10.0 rgeos_0.5-9
## [97] GenomeInfoDbData_1.2.7 gridExtra_2.3
## [99] lattice_0.20-45 deldir_1.0-6
## [101] utf8_1.2.2 later_1.2.0
## [103] BiocFileCache_2.2.0 jsonlite_1.8.0
## [105] scales_1.2.0 docopt_0.7.1
## [107] tidytree_0.3.9 pbapply_1.5-0
## [109] nabor_0.5.0 lazyeval_0.2.2
## [111] promises_1.2.0.1 doParallel_1.0.17
## [113] R.utils_2.12.0 latticeExtra_0.6-30
## [115] goftest_1.2-3 spatstat.utils_2.3-1
## [117] reticulate_1.25 checkmate_2.1.0
## [119] rmarkdown_2.14 cowplot_1.1.1
## [121] Rtsne_0.16 dichromat_2.0-0.1
## [123] downloader_0.4 uwot_0.1.11
## [125] igraph_1.3.0 survival_3.3-1
## [127] yaml_2.3.5 htmltools_0.5.2
## [129] memoise_2.0.1 VariantAnnotation_1.40.0
## [131] BiocIO_1.4.0 graphlayouts_0.8.0
## [133] viridisLite_0.4.0 digest_0.6.29
## [135] assertthat_0.2.1 mime_0.12
## [137] rappdirs_0.3.3 RSQLite_2.2.8
## [139] yulab.utils_0.0.5 future.apply_1.9.0
## [141] misc3d_0.9-1 remotes_2.4.2
## [143] data.table_1.14.2 blob_1.2.3
## [145] R.oo_1.25.0 splines_4.1.0
## [147] Formula_1.2-4 labeling_0.4.2
## [149] ProtGenerics_1.26.0 RCurl_1.98-1.7
## [151] hms_1.1.1 colorspace_2.0-3
## [153] base64enc_0.1-3 BiocManager_1.30.18
## [155] shape_1.4.6 aplot_0.1.6
## [157] nnet_7.3-17 sass_0.4.1
## [159] Rcpp_1.0.9 RANN_2.6.1
## [161] circlize_0.4.15 enrichplot_1.14.1
## [163] ggseqlogo_0.1 fansi_1.0.3
## [165] tzdb_0.3.0 parallelly_1.32.0
## [167] R6_2.5.1 ggridges_0.5.3
## [169] lifecycle_1.0.1 curl_4.3.2
## [171] motifmatchr_1.16.0 leiden_0.4.2
## [173] jquerylib_0.1.4 DO.db_2.9
## [175] Matrix_1.4-1 qvalue_2.26.0
## [177] RcppAnnoy_0.0.19 iterators_1.0.14
## [179] htmlwidgets_1.5.4 polyclip_1.10-0
## [181] biomaRt_2.50.0 purrr_0.3.4
## [183] shadowtext_0.1.2 gridGraphics_0.5-1
## [185] seqLogo_1.60.0 mgcv_1.8-40
## [187] globals_0.15.1 htmlTable_2.4.1
## [189] patchwork_1.1.1 spatstat.random_2.2-0
## [191] progressr_0.10.1 codetools_0.2-18
## [193] matrixStats_0.62.0 GO.db_3.14.0
## [195] gtools_3.9.3 prettyunits_1.1.1
## [197] dbplyr_2.2.1 RSpectra_0.16-1
## [199] R.methodsS3_1.8.2 gtable_0.3.0
## [201] DBI_1.1.3 ggfun_0.0.6
## [203] tensor_1.5 httr_1.4.3
## [205] highr_0.9 KernSmooth_2.23-20
## [207] stringi_1.7.6 progress_1.2.2
## [209] reshape2_1.4.4 farver_2.1.1
## [211] annotate_1.72.0 viridis_0.6.2
## [213] ggtree_3.2.0 DT_0.23
## [215] xml2_1.3.3 rvcheck_0.1.8
## [217] restfulr_0.0.15 readr_2.1.2
## [219] interp_1.1-2 ggplotify_0.1.0
## [221] scattermore_0.8 bit_4.0.4
## [223] scatterpie_0.1.6 jpeg_0.1-9
## [225] MatrixGenerics_1.6.0 spatstat.data_2.2-0
## [227] pkgconfig_2.0.3 DirichletMultinomial_1.36.0